MONet Community Science Meeting 2023

This RMarkdown document contains code and scripts for preliminary processing and data visualization of soil biogeochemistry data. These graphs were created for use in Session 3 (Biogeochemistry Data Tutorial) on Tuesday, November 7.

Setup

Load packages

library(tidyverse) # for general data wrangling and visualization
library(maptools); library(sf) # for maps
library(ggthemes) # for theme_map
library(ggcorrplot) # to plot correlation matrix
library(soilpalettes) # for color palettes
# Install `soilpalettes` using: install.packages("devtools"); devtools::install_github("kaizadp/soilpalettes")

theme_set(theme_bw(base_size = 12)) # set default ggplot theme

Import files

bgc_data = read_csv("Biogeochem/data/bgc_data.csv")
metadata = read_csv("Biogeochem/data/bgc_metadata.csv")
bgc_analyses = read_csv("Biogeochem/data/bgc_analyses.csv") %>% dplyr::select(column, abbreviated, analysis_type)

Process the data. We will use the data in two forms - longform and wideform.

data_long = 
  bgc_data %>% 
  pivot_longer(cols = -c(Site_Code, Location), names_to = "column") %>% 
  left_join(bgc_analyses) %>% 
  left_join(metadata %>% dplyr::select(Site_Code, Lat, Long, biome_name))

data_wide = 
  data_long %>% 
  dplyr::select(-c(column, analysis_type)) %>% 
  pivot_wider(names_from = "abbreviated", values_from = "value") %>% 
  mutate(Location = factor(Location, levels = c("TOP", "BTM")))

BASIC EXPLORATION

data_long %>% 
  ggplot(aes(x = abbreviated, y = value, color = Location))+
  geom_jitter(width = 0.3)+
  facet_wrap(~analysis_type, scales = "free")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The samples were collected from sites spread across five biomes

## Create summary of biome counts
biome_numbers = 
  data_long %>% 
  distinct(Site_Code, biome_name) %>% 
  group_by(biome_name) %>% 
  dplyr::summarise(n = n()) %>% 
  force()

biome_numbers %>% knitr::kable()
biome_name n
Deserts & Xeric Shrublands 13
Mediterranean Forests, Woodlands & Scrub 1
Temperate Broadleaf & Mixed Forests 21
Temperate Conifer Forests 19
Temperate Grasslands, Savannas & Shrublands 11
NA 5
# biome map

make_map_biome = function(bgc_wide, VAR, TITLE = "", LEGEND = VAR){
  
  ## Set CRS
  common_crs <- 4326
  
  ## Set map size and point size
  point_size <- 2
  map_width = 9
  map_height = 6
  
  # Set up map layers for plotting
  
  ## Make US states map cropped to contiguous region
  us <- read_sf("Biogeochem/cb_2018_us_state_5m/cb_2018_us_state_5m.shp") %>% 
    st_transform(., crs = common_crs) 
  
  us_bbox <- c(xmin = -125, xmax = -60, ymin = 20, ymax = 50)
  region <- st_crop(us, y = us_bbox)
  
  # make a dataset merging metadata with site lat-longs
  df_map <- 
    bgc_wide %>% 
    filter(!is.na(Lat) & !is.na(Long)) %>% 
    st_as_sf(., coords = c("Long", "Lat"), crs = common_crs)
  
  # Make the base map with all sites
  ## base_plot <- 
  ggplot() + 
    geom_sf(data = region, fill = "white", color = "grey70") + 
    geom_sf(data = df_map, aes_string(fill = VAR), size = 7, shape = 21, color = "black", stroke = 1) + 
    labs(title = TITLE,
         fill = LEGEND)+
    ggthemes::theme_map() + 
    theme(legend.background = element_rect(fill = alpha("white", 0.0)), 
          legend.key = element_rect(fill = "transparent"), 
          legend.key.size = unit(1, "cm"),
          legend.text = element_text(size = 12),
          legend.title = element_text(size = 12, face = "bold", vjust = 0.7),
          legend.position = "right",
          plot.title = element_text(size = 14, face = "bold"),
          plot.background = element_rect(color = "black", fill = NA, linewidth = 1)) + 
#    scale_color_viridis_c(option = "plasma") +
    scale_fill_manual(values = soilpalettes::soil_palette("redox2", 5))+
 NULL
  
}
make_map_biome(data_wide, VAR = "biome_name")


CORRELATIONS AND PCAs

Use this function to set up the correlation plots for the entire dataset.
We use the ggcorrplot package for this.

  fit_correlations_function = function(dat, TITLE = ""){
    num = 
      dat %>%       
      dplyr::select(where(is.numeric)) %>%
      drop_na()
    
    num_clean = 
      num %>% 
      rownames_to_column("row") %>% 
      pivot_longer(-row) %>% 
      separate(name, sep = "_", into = c("name")) %>% 
      pivot_wider() %>% 
      dplyr::select(-row)
    
    
    m = cor(num_clean)
    p.mat <- ggcorrplot::cor_pmat(num_clean)
    
    ggcorrplot::ggcorrplot(m, type = "lower",
                           p.mat = p.mat,
                           outline.color = "black",
                           #   lab = TRUE, 
                           insig = "blank",
                           colors = c("#E46726", "white", "#6D9EC1"),
                           title = TITLE)
    
  }
# this will generate a correlation matrix for the entire BGC dataset
  corr_all = fit_correlations_function(data_wide %>% dplyr::select(-Lat, -Long, -GWC))

# However, there are a lot of variables here. 
# So, for an easier plot, subset select variables and then re-run the correlation matrix.

data_wide_subset = data_wide %>% 
  dplyr::select(Ca, Mg, Na, K, Bases, CEC, 
                Sand, Silt, Clay, SO4S, NO3N, NH4N,
                TC, TN, pH, Lat, Long, Location)

fit_correlations_function(data_wide_subset %>% filter(Location == "TOP"))

Individual Correlations

data_wide %>% 
  ggplot(aes(x = TC, y = TN))+
  geom_point()+
  geom_smooth(method = "lm", se = F)+
  labs(x = "Total C (%)",
       y = "Total N (%)")

data_wide %>% 
  ggplot(aes(x = Clay, y = Sand))+
  geom_point()+
  geom_smooth(method = "lm", se = F)+
  labs(x = "Clay (%)",
       y = "Sand (%)")

data_wide %>% 
  ggplot(aes(y = CEC, x = Clay))+
  geom_point()+
  geom_smooth(method = "lm", se = F)+
  labs(y = "Cation Exchange Capacity (meq/100g)",
       x = "Clay (%)")

data_wide %>% 
  ggplot(aes(x = Ca, y = Bases))+
  geom_point()+
  geom_smooth(method = "lm", se = F)+
  labs(y = "Total bases (meq/100g)",
       x = "Calcium (meq/100g)")

data_wide %>% 
  ggplot(aes(x = TC, y = WEOC))+
  geom_point()+
  geom_smooth(method = "lm", se = F)+
  labs(x = "Total carbon (%)",
       y = "Water-extractable organic carbon (mg/g)")


JITTER PLOTS

These are “jittered” scatter plots showing distribution of the variables by depth or by biome.
Also included is code to calculate the analysis of variance (ANOVA)

1. total carbon (TC)

TC ANOVA

lm((TC) ~ Location * biome_name, data = data_wide) %>% car::Anova()
## Anova Table (Type II tests)
## 
## Response: (TC)
##                     Sum Sq  Df F value   Pr(>F)    
## Location            151.92   1 24.2377  2.9e-06 ***
## biome_name           92.18   4  3.6768 0.007448 ** 
## Location:biome_name  61.48   4  2.4521 0.049940 *  
## Residuals           714.55 114                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

plot TC by depth

data_wide %>% 
  ggplot(aes(x = Location, y = TC, color = Location))+
  geom_boxplot(position = position_dodge(width = 0.5), width = 0.3, outlier.colour = NA)+
  geom_jitter(size = 3, width = 0.3)

plot TC by biome

set.seed(1234)
data_wide %>% 
  ggplot(aes(x = biome_name, y = TC, color = Location))+
  geom_boxplot(position = position_dodge(width = 0.5), width = 0.3)+
  geom_point(size = 3,
             position = position_jitterdodge(jitter.width = 0.1, dodge.width = 0.5))+
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10))+
  labs(x = "")

2. CEC

CEC ANOVA

lm((CEC) ~ Location * biome_name, data = data_wide) %>% car::Anova()
## Anova Table (Type II tests)
## 
## Response: (CEC)
##                      Sum Sq  Df F value    Pr(>F)    
## Location              100.0   1  0.7992    0.3732    
## biome_name           3283.7   4  6.5619 8.661e-05 ***
## Location:biome_name   466.0   4  0.9311    0.4486    
## Residuals           14261.9 114                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

plot CEC by depth

data_wide %>% 
  ggplot(aes(x = Location, y = CEC, color = Location))+
  geom_boxplot(position = position_dodge(width = 0.5), width = 0.3, outlier.colour = NA)+
  geom_jitter(size = 3, width = 0.3)+
  labs(x = "")

plot CEC by biome

set.seed(1234)
data_wide %>% 
  ggplot(aes(x = biome_name, y = CEC, color = Location))+
  #geom_jitter(size = 3, width = 0.2)+
  geom_boxplot(position = position_dodge(width = 0.5), width = 0.3)+
  geom_point(size = 3,
             position = position_jitterdodge(jitter.width = 0.1, dodge.width = 0.5))+
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10))+
  labs(x = "")

3. Clay

Clay ANOVA

lm((Clay) ~ Location * biome_name, data = data_wide) %>% car::Anova()
## Anova Table (Type II tests)
## 
## Response: (Clay)
##                      Sum Sq  Df F value    Pr(>F)    
## Location              483.7   1  5.2195   0.02412 *  
## biome_name           2574.6   4  6.9449 4.686e-05 ***
## Location:biome_name    88.4   4  0.2385   0.91607    
## Residuals           10936.1 118                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

plot Clay by depth

data_wide %>% 
  ggplot(aes(x = Location, y = Clay, color = Location))+
  geom_boxplot(position = position_dodge(width = 0.5), width = 0.3, outlier.colour = NA)+
  geom_jitter(size = 3, width = 0.3)+
  labs(x = "")

plot Clay by biome

set.seed(1234)
data_wide %>% 
  ggplot(aes(x = biome_name, y = Clay, color = Location))+
  geom_boxplot(position = position_dodge(width = 0.5), width = 0.3)+
  geom_point(size = 3,
             position = position_jitterdodge(jitter.width = 0.1, dodge.width = 0.5))+
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10))+
  labs(x = "")

4. WEOC

WEOC ANOVA

lm((WEOC) ~ Location * biome_name, data = data_wide) %>% car::Anova()
## Anova Table (Type II tests)
## 
## Response: (WEOC)
##                      Sum Sq  Df F value    Pr(>F)    
## Location            0.16630   1 12.7321 0.0005222 ***
## biome_name          0.14170   4  2.7122 0.0333182 *  
## Location:biome_name 0.05427   4  1.0387 0.3903703    
## Residuals           1.52822 117                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

plot WEOC by depth

data_wide %>% 
  ggplot(aes(x = Location, y = WEOC, color = Location))+
  geom_boxplot(position = position_dodge(width = 0.5), width = 0.3, outlier.colour = NA)+
  geom_jitter(size = 3, width = 0.3)+
  labs(x = "")

plot WEOC by biome

set.seed(1234)
data_wide %>% 
  ggplot(aes(x = biome_name, y = WEOC, color = Location))+
  geom_boxplot(position = position_dodge(width = 0.5), width = 0.3)+
  geom_point(size = 3,
             position = position_jitterdodge(jitter.width = 0.1, dodge.width = 0.5))+
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10))+
  labs(x = "")


MAPS

This section has code to plot variables in a spatial manner on maps.
The easiest way to do this is to run the custom make_map function first, which will pre-load the map and other parameters.
You then select the variable you want to plot below.

make_map = function(bgc_wide, VAR, TITLE = "", LEGEND = VAR){
  
  ## Set CRS
  common_crs <- 4326
  
  ## Set map size and point size
  point_size <- 2
  map_width = 9
  map_height = 6
  
  # Set up map layers for plotting
  
  ## Make US states map cropped to contiguous region
  us <- read_sf("Biogeochem/cb_2018_us_state_5m/cb_2018_us_state_5m.shp") %>% 
    st_transform(., crs = common_crs) 
  
  us_bbox <- c(xmin = -125, xmax = -60, ymin = 20, ymax = 50)
  region <- st_crop(us, y = us_bbox)
  
  # make a dataset merging metadata with site lat-longs
  df_map <- 
    bgc_wide %>% 
    filter(!is.na(Lat) & !is.na(Long)) %>% 
    st_as_sf(., coords = c("Long", "Lat"), crs = common_crs)
  
  # Make the base map with all sites
  ## base_plot <- 
  ggplot() + 
    geom_sf(data = region, fill = "white", color = "grey70") + 
    geom_sf(data = df_map, aes_string(fill = VAR), size = 7, shape = 21, color = "black", stroke = 1) + 
    labs(title = TITLE,
         fill = LEGEND)+
    ggthemes::theme_map() + 
    theme(legend.background = element_rect(fill = alpha("white", 0.0)), 
          legend.key = element_rect(fill = "transparent"), 
          legend.key.size = unit(1, "cm"),
          legend.text = element_text(size = 12),
          legend.title = element_text(size = 12, face = "bold", vjust = 0.7),
          legend.position = "bottom",
          plot.title = element_text(size = 14, face = "bold"),
          plot.background = element_rect(color = "black", fill = NA, linewidth = 1)) + 
#    scale_color_viridis_c(option = "plasma") +
    scale_fill_gradientn(colors = soilpalettes::soil_palette("redox2", 5))
  
}

Now, make the maps

make_map(data_wide %>% filter(Location == "TOP"), VAR = "TC", LEGEND = "TC (%)")

make_map(data_wide %>% filter(Location == "TOP"), VAR = "WEOC", LEGEND = "WEOC (mg/g)")

make_map(data_wide %>% filter(Location == "TOP"), VAR = "MBC", LEGEND = "MBC (mg/g)")

make_map(data_wide %>% filter(Location == "TOP"), VAR = "TN", LEGEND = "TN (%)")

make_map(data_wide %>% filter(Location == "BTM"), VAR = "CEC", LEGEND = "CEC (meq/100g)")

make_map(data_wide %>% filter(Location == "TOP"), VAR = "Clay", LEGEND = "Clay (%)")